Analyze selection data using soluble Ephrin-B2 or -B3¶
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
#input configs
altair_config = None
nipah_config = None
#input files
func_scores_E2_file = None
binding_E2_file = None
func_scores_E3_file = None
binding_E3_file = None
#output files
filtered_E2_binding_data = None
filtered_E3_binding_data = None
#output images
E2_binding_entry = None
E3_binding_entry = None
E2_binding_heatmap = None
E3_binding_heatmap = None
E2_E3_correlation = None
E2_E3_correlation_site = None
In [2]:
# Parameters
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
E2_binding_entry = "results/images/E2_binding_entry.html"
E3_binding_entry = "results/images/E3_binding_entry.html"
E2_binding_heatmap = "results/images/E2_binding_heatmap.html"
E3_binding_heatmap = "results/images/E3_binding_heatmap.html"
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
Setup in correct directory
Run Config File to Setup Altair theme¶
In [5]:
if altair_config:
with open(altair_config, 'r') as file:
exec(file.read())
In [6]:
with open(nipah_config) as f:
config = yaml.safe_load(f)
Make the E2/E3 dataframes, filter separately, then merge¶
In [7]:
e2 = pd.read_csv(binding_E2_file)
e2_func = pd.read_csv(func_scores_E2_file)
e3 = pd.read_csv(binding_E3_file)
e3_func = pd.read_csv(func_scores_E3_file)
# Pull in monomeric EFNB2 and EFNB2 cell entry scores
#e2 = pd.read_csv('results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv')
#e2_func = pd.read_csv('results/func_effects/averages/CHO_EFNB2_low_func_effects.csv')
# Now do EFNB3
#e3 = pd.read_csv('results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv')
#e3_func = pd.read_csv('results/func_effects/averages/CHO_EFNB3_low_func_effects.csv')
def merge_func_binding_dfs(func,binding,name):
df_int = pd.merge(
binding,
func,
on=['site','mutant','wildtype'],
suffixes=['_affinity','_cell_entry'],
validate='one_to_one',
how='outer'
)
df = df_int.rename(columns={'Ephrin binding_mean':'binding_mean','Ephrin binding_std':'binding_std','Ephrin binding_median':'binding_median'})
#only filter func effects
df_pre_filter = df[
(df['mutant'] != '*') &
(df['mutant'] != '-') &
(df['site'] != 603) &
(df['effect'] >= config['min_func_effect_for_binding']) &
(df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
(df['effect_std'] <= config['func_std_cutoff'])
]
#Now filter binding
df_post_filter = df_pre_filter[
(df_pre_filter['times_seen_affinity'] >= config['min_times_seen_binding']) &
(df_pre_filter['binding_std'] <= config['max_binding_std']) &
(df_pre_filter['frac_models'] >= config['frac_models'])
]
def plot_affinity_corr(df):
if name == 'E2':
color = '#1f4e79'
else:
color = '#ff7f0e'
chart = alt.Chart(df).mark_point(size=30, color='black', opacity=0.2, filled=True).encode(
x=alt.X('effect', title=f'{name} Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title=f'{name} Binding', axis=alt.Axis(grid=True)),
tooltip=['site', 'wildtype', 'mutant'],
).properties(
width=alt.Step(10),
height=alt.Step(10),
)
return chart.display()
def plot_times_seen_std(df):
if name == 'E2':
color = '#1f4e79'
else:
color = '#ff7f0e'
chart = alt.Chart(df).mark_circle(size=20,color='black',opacity=0.2).encode(
x=alt.X('times_seen_affinity',axis=alt.Axis(grid=True),title=f'Times Seen for {name}',scale=alt.Scale(type='log')),
y=alt.Y('binding_std',title='Binding Std',axis=alt.Axis(grid=True)),
tooltip=['site','times_seen_affinity','effect_std','effect']
).properties(
height=alt.Step(10),
width=alt.Step(10)
)
return chart.display()
plot_affinity_corr(df)
#entry_vs_binding = plot_affinity_corr(df_post_filter)
#entry_vs_binding.display()
#entry_vs_binding.save(entry_vs_binding)
plot_times_seen_std(df)
#plot_times_seen_std(df_post_filter)
#For pulling out low effect mutants for heatmaps later
def store_filtered_info(df):
df_filter = df[
(df['mutant'] != '*') &
(df['mutant'] != '-') &
(df['site'] != 603) &
(df['effect'] < config['min_func_effect_for_binding']) &
(df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
(df['effect_std'] <= config['func_std_cutoff'])
]
return df_filter
df_low_effect_filter = store_filtered_info(df)
return df_post_filter,df_low_effect_filter
df_E2_filter,df_E2_filter_missing = merge_func_binding_dfs(e2_func,e2,'EFNB2')
df_E3_filter,df_E3_filter_missing = merge_func_binding_dfs(e3_func,e3,'EFNB3')
def plot_corr_binding_entry(df,name):
chart = alt.Chart(df).mark_point(size=30, color='black', opacity=0.2, filled=True).encode(
x=alt.X('effect', title=f'{name} Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title=f'{name} Binding', axis=alt.Axis(grid=True)),
tooltip=['site', 'wildtype', 'mutant'],
).properties(
width=alt.Step(10),
height=alt.Step(10),
)
return chart
E2_binding_entry_tmp = plot_corr_binding_entry(df_E2_filter,'EFNB2')
E2_binding_entry_tmp.save(E2_binding_entry)
E3_binding_entry_tmp = plot_corr_binding_entry(df_E3_filter,'EFNB3')
E3_binding_entry_tmp.save(E3_binding_entry)
#Save filtered dataframes for crystal structure mapping
df_E2_filter.to_csv(filtered_E2_binding_data,index=False)
df_E3_filter.to_csv(filtered_E3_binding_data,index=False)
#Now that they are filtered, merge EFNB2 and EFNB3
df_affinity_filter_merge = pd.merge(
df_E2_filter,
df_E3_filter,
on=['site','wildtype','mutant'],
suffixes=['_E2','_E3'],
how='outer'
)
#Add columns that calculate difference between EFNB2 and EFNB3 cell entry and EFNB2 and EFNB3 binding.
df_affinity_filter_merge['func_effect_diff'] = (df_affinity_filter_merge['effect_E2'] - df_affinity_filter_merge['effect_E3']).abs()
df_affinity_filter_merge['binding_effect_diff'] = (df_affinity_filter_merge['binding_mean_E2'] - df_affinity_filter_merge['binding_mean_E3']).abs()
#display stats
display(df_affinity_filter_merge[['binding_std_E2','binding_std_E3','binding_median_E2','binding_median_E3']].describe())
| binding_std_E2 | binding_std_E3 | binding_median_E2 | binding_median_E3 | |
|---|---|---|---|---|
| count | 6524.000000 | 6408.000000 | 6524.000000 | 6408.000000 |
| mean | 0.494191 | 0.177242 | -0.327325 | -0.017234 |
| std | 0.314740 | 0.168976 | 1.082116 | 0.269693 |
| min | 0.004012 | 0.000018 | -5.284000 | -1.960000 |
| 25% | 0.269950 | 0.059545 | -0.407250 | -0.148975 |
| 50% | 0.421400 | 0.132600 | -0.025265 | -0.014215 |
| 75% | 0.639225 | 0.242400 | 0.190425 | 0.115475 |
| max | 1.982000 | 1.795000 | 2.335000 | 2.008000 |
In [8]:
def overall_stats(df,effect,name):
#Find quantiles
quantile_2 = df['binding_median'].quantile(.02)
quantile_98 = df['binding_median'].quantile(.98)
print(f'The 2% quantile for {name} is: {quantile_2}')
print(f'The 98% quantile for {name} is: {quantile_98}')
#Now group sites and find intolerant sites
filtered_df = df.groupby('site').filter(lambda group: (group[effect] <-0.25).all())
unique = filtered_df['site'].unique()
# Convert unique to a Pandas Series
unique_series = pd.Series(unique)
#print(unique_series)
# Find the common elements
unique_contact_bool = unique_series.isin(config['contact_sites'])
#print(unique_contact_bool)
# Filter and get the common elements
common_elements = unique_series[unique_contact_bool]
# Print the common elements
print(f'Here are the contact sites that are conserved: {common_elements}')
print(f'There are {len(unique)} sites with all negative binding score mutants for {name}')
print(f'These are the sites for {name} with all negative binding score mutants: {list(unique)}')
#Now find sites with low and high binding (median)
median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median')
print(f'For {name}, these are the sites with lowest median binding scores: {median_df.head(5)}')
median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median',ascending=False)
print(f'For {name}, these are the sites with highest median binding scores: {median_df.head(5)}')
#Now calculate mutant number
total_mutants = df.shape[0]
upper_cutoff = df[df[effect] > 1].sort_values(by='binding_median',ascending=False)
median_upper = upper_cutoff['effect'].median()
print(f'The median entry score for top binders was: {median_upper}')
mutants_above_cutoff_tolerated = upper_cutoff[upper_cutoff['effect'] > 0]
mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[['site','effect','binding_median','wildtype','mutant']]
print(f'The mutants with positive entry scores and good binding are: {mutants_above_cutoff_tolerated.head(5)}')
lower_cutoff = df[df[effect] < -1]
print(f'For {name}, there are a total of : {total_mutants} binding mutants')
print(f'For {name}, there are {upper_cutoff.shape[0]} mutants above cutoff, and {mutants_above_cutoff_tolerated.shape[0]} that have good entry scores')
print(f'For {name}, there are {lower_cutoff.shape[0]} mutants below cutoff')
total_sites = df['site'].unique().shape[0]
print(f'The total number of sites are: {total_sites}')
overall_stats(df_E2_filter,'binding_median','E2')
overall_stats(df_E3_filter,'binding_median','E3')
The 2% quantile for E2 is: -3.87716 The 98% quantile for E2 is: 1.10954 Here are the contact sites that are conserved: 5 242 11 389 23 488 24 490 25 491 29 501 30 504 31 505 33 531 34 532 35 533 36 557 37 579 38 581 41 588 dtype: int64 There are 44 sites with all negative binding score mutants for E2 These are the sites for E2 with all negative binding score mutants: [116, 206, 220, 236, 238, 242, 243, 248, 346, 351, 352, 389, 390, 398, 399, 400, 438, 439, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590, 594] For E2, these are the sites with lowest median binding scores: site binding_median 442 533 -4.0450 406 494 -4.0250 403 490 -3.9855 401 487 -3.8680 413 504 -3.8420 For E2, these are the sites with highest median binding scores: site binding_median 132 208 1.38740 48 120 1.36300 59 132 1.24100 131 207 1.20775 249 331 1.12100 The median entry score for top binders was: -0.76 The mutants with positive entry scores and good binding are: site effect binding_median wildtype mutant 1324 139 0.00505 1.989 N Y 5448 354 0.22810 1.498 S T 9533 566 0.08127 1.415 F H 7170 444 0.16280 1.256 I F 1217 134 0.09948 1.249 S I For E2, there are a total of : 6524 binding mutants For E2, there are 173 mutants above cutoff, and 22 that have good entry scores For E2, there are 953 mutants below cutoff The total number of sites are: 510 The 2% quantile for E3 is: -0.631902 The 98% quantile for E3 is: 0.6011319999999996 Here are the contact sites that are conserved: 3 389 5 488 9 501 10 531 11 532 dtype: int64 There are 12 sites with all negative binding score mutants for E3 These are the sites for E3 with all negative binding score mutants: [108, 140, 352, 389, 486, 488, 494, 495, 497, 501, 531, 532] For E3, these are the sites with lowest median binding scores: site binding_median 413 501 -0.91625 439 531 -0.72305 307 389 -0.68695 404 488 -0.64390 440 532 -0.63130 For E3, these are the sites with highest median binding scores: site binding_median 492 589 0.67100 84 159 0.56090 56 129 0.53290 59 132 0.46735 86 161 0.44775 The median entry score for top binders was: -0.7568 The mutants with positive entry scores and good binding are: site effect binding_median wildtype mutant 7995 492 0.49590 1.200 Q L 2632 211 0.03675 1.149 G F 10015 598 0.34370 1.141 P G 1655 161 0.23430 1.011 S E For E3, there are a total of : 6408 binding mutants For E3, there are 25 mutants above cutoff, and 4 that have good entry scores For E3, there are 11 mutants below cutoff The total number of sites are: 506
Find sites with opposite effects on binding¶
In [9]:
#find sites that are different
def find_biggest_differences(df):
df = df[['site','wildtype','mutant','binding_median_E2',
'LibA-231112-EFNB2-monomeric','LibA-231222-BA6-CHO-EFNB2-monomeric','LibB-231108-EFNB2-monomeric', 'LibB-231222-BA6-CHO-EFNB2-monomeric',
'effect_E2',
'binding_median_E3','LibA-230825-EFNB3-dimeric','LibB-230907-EFNB3-dimeric','effect_E3','effect_std_E3','times_seen_cell_entry_E3','binding_effect_diff']]
#,'binding_effect_diff']]
df = df[df['site'].isin(config['contact_sites'])]
efnb2_good_efnb3_bad = df[
(df['binding_median_E2'] > 0.1) &
(df['binding_median_E3'] < -0.1)
].sort_values(by='binding_median_E2',ascending=False)
display(efnb2_good_efnb3_bad)
efnb2_bad_efnb3_good = df[
(df['binding_median_E2'] < -0.1) &
(df['binding_median_E3'] > 0.1)
].sort_values(by='binding_median_E3',ascending=False)
display(efnb2_bad_efnb3_good)
find_biggest_differences(df_affinity_filter_merge)
| site | wildtype | mutant | binding_median_E2 | LibA-231112-EFNB2-monomeric | LibA-231222-BA6-CHO-EFNB2-monomeric | LibB-231108-EFNB2-monomeric | LibB-231222-BA6-CHO-EFNB2-monomeric | effect_E2 | binding_median_E3 | LibA-230825-EFNB3-dimeric | LibB-230907-EFNB3-dimeric | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | binding_effect_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1786 | 241 | S | L | 0.8667 | 1.0410 | 0.61050 | 0.6921 | 1.2300 | -0.4249 | -0.1359 | -0.1972 | -0.07473 | 0.15840 | 0.2774 | 6.571 | 1.02940 |
| 1781 | 241 | S | F | 0.6864 | 0.7802 | 0.58160 | 0.5925 | 1.7230 | -0.3853 | -0.1101 | -0.4214 | 0.20110 | -0.22970 | 0.2284 | 6.857 | 1.02940 |
| 1782 | 241 | S | G | 0.1692 | 0.2446 | 0.09392 | 0.3091 | -0.5815 | -0.3386 | -0.2740 | -0.1346 | -0.41340 | -0.09642 | 0.2263 | 5.857 | 0.29052 |
| site | wildtype | mutant | binding_median_E2 | LibA-231112-EFNB2-monomeric | LibA-231222-BA6-CHO-EFNB2-monomeric | LibB-231108-EFNB2-monomeric | LibB-231222-BA6-CHO-EFNB2-monomeric | effect_E2 | binding_median_E3 | LibA-230825-EFNB3-dimeric | LibB-230907-EFNB3-dimeric | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | binding_effect_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5149 | 492 | Q | L | -0.1671 | -0.06133 | -0.27290 | -0.73670 | 0.52310 | 0.51440 | 1.2000 | 1.22200 | 1.17800 | 0.4959 | 0.2963 | 4.857 | 1.3370 |
| 6360 | 588 | I | P | -2.2390 | -3.55400 | -1.65200 | -2.26100 | -2.21700 | -0.18690 | 1.0520 | 1.01300 | 1.09000 | -0.6374 | 0.3687 | 5.000 | 3.4730 |
| 5152 | 492 | Q | R | -2.9170 | -3.51500 | -2.60900 | -2.15100 | -3.22500 | 0.01257 | 0.6505 | 0.70400 | 0.59710 | 0.4092 | 0.2253 | 19.290 | 3.5255 |
| 5126 | 491 | S | A | -0.9052 | -1.08200 | -0.72840 | -0.40320 | -1.52200 | -0.59270 | 0.6041 | 0.57220 | 0.63610 | 0.2335 | 0.5233 | 6.143 | 1.5381 |
| 3924 | 402 | R | M | -0.4290 | -0.37740 | -0.15310 | -0.48070 | -0.82860 | -0.23740 | 0.3071 | 0.27330 | 0.34090 | -0.1438 | 0.2759 | 5.857 | 0.7670 |
| 6223 | 580 | I | E | -0.4390 | 0.88280 | -0.45010 | -0.42780 | -0.89910 | -0.57650 | 0.2554 | 0.15280 | 0.35790 | 0.5231 | 0.5856 | 5.143 | 0.4790 |
| 5980 | 559 | Q | L | -0.2814 | -1.42100 | -0.18540 | -0.37740 | -0.06623 | -0.77730 | 0.1812 | 0.17160 | 0.19090 | -0.5984 | 0.7335 | 4.571 | 0.6936 |
| 5119 | 490 | Q | L | -1.4730 | -1.50600 | -1.44000 | -1.57700 | -0.91850 | 0.41890 | 0.1462 | 0.33720 | -0.04486 | 0.5374 | 0.2450 | 5.857 | 1.5062 |
| 3744 | 388 | Q | Y | -0.1516 | -1.03200 | 0.04747 | 0.37960 | -0.35070 | 0.50160 | 0.1412 | 0.14140 | 0.14100 | 0.5493 | 0.0976 | 5.143 | 0.3800 |
| 3743 | 388 | Q | W | -0.2135 | -0.46200 | -0.06387 | -0.01505 | -0.36320 | 0.38560 | 0.1166 | 0.06678 | 0.16650 | -0.2053 | 0.2456 | 6.429 | 0.3426 |
| 1812 | 242 | R | W | -1.2840 | -1.73800 | -0.90460 | -0.37760 | -1.66300 | 0.09253 | 0.1141 | -0.01441 | 0.24250 | -0.4407 | 0.4435 | 6.286 | 1.2851 |
| 5300 | 507 | V | M | -1.3570 | -0.19970 | -0.99620 | -1.89000 | -1.71900 | 0.41880 | 0.1024 | 0.53630 | -0.33150 | -0.4414 | 0.5062 | 5.143 | 1.3034 |
Find correlations between EFNB2 and EFNB3 binding¶
In [10]:
def plot_affinity_solid(df):
df = df.dropna()
# calculate r value
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df['binding_median_E2'], df['binding_median_E3'])
r_value = float(r_value)
# make chart
chart = alt.Chart(df).mark_point(color='black',size=30, opacity=0.2,filled=True).encode(
x=alt.X('binding_median_E2', title=('EFNB2 Binding'), axis=alt.Axis(grid=True, tickCount=3)),
y=alt.Y('binding_median_E3', title=('EFNB3 Binding'), axis=alt.Axis(grid=True, tickCount=3)),
tooltip=['site', 'wildtype','mutant','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
).properties(
width=alt.Step(20),
height=alt.Step(20)
)
min = int(df['binding_median_E2'].min())
max = int(df['binding_median_E3'].max())
text = alt.Chart({'values':[{'x': min, 'y': max, 'text': f'r = {r_value:.2f}'}]}).mark_text(
align='left', baseline='top', dx=-10, dy=-20).encode(
x=alt.X('x:Q'),
y=alt.Y('y:Q'),
text='text:N'
)
chart_and_text = chart + text
return chart_and_text#.display()
E2_E3_corr = plot_affinity_solid(df_affinity_filter_merge)
E2_E3_corr.display()
E2_E3_corr.save(E2_E3_correlation)
Plot correlations between summary statistics for each site¶
In [11]:
def plot_affinity_solid_mean(df):
df = df.dropna()
means = df.groupby('site').agg({
'effect_E2': 'median',
'effect_E3': 'median',
'binding_median_E2': 'median',
'binding_median_E3': 'median',
'wildtype': 'first'
}).reset_index()
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(means['binding_median_E2'], means['binding_median_E3'])
r_value = float(r_value)
chart = alt.Chart(means).mark_point(size=30, color='black', opacity=0.3,filled=True).encode(
x=alt.X('binding_median_E2', title=('Median EFNB2 Binding'), axis=alt.Axis(grid=True, tickCount=3)),
y=alt.Y('binding_median_E3', title=('Median EFNB3 Binding'), axis=alt.Axis(grid=True, tickCount=3)),
tooltip=['site', 'wildtype','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
).properties(
width=alt.Step(20),
height=alt.Step(20)
)
text = alt.Chart({'values':[{'x': -3.5, 'y': 0.5, 'text': f'r = {r_value:.2f}'}]}).mark_text(
align='left', baseline='top', dx=0, dy=-10).encode(
x=alt.X('x:Q'),
y=alt.Y('y:Q'),
text='text:N'
)
chart_and_text = chart + text
return chart_and_text#.display()
E2_E3_site_corr = plot_affinity_solid_mean(df_affinity_filter_merge)
E2_E3_site_corr.display()
E2_E3_site_corr.save(E2_E3_correlation_site)
Make plot showing binding by site (median)¶
In [12]:
def plot_affinity_by_site_E2_median(df):
mean = df.groupby('site')['binding_median_E2'].median().reset_index()
chart = alt.Chart(mean).mark_point(size=60, color='black', opacity=0.3,filled=True).encode(
x=alt.X('site', title=('Site'), axis=alt.Axis(grid=True, tickCount=4),scale=alt.Scale(domain=[70,602])),
y=alt.Y('binding_median_E2', title=('EFNB2 Binding'), axis=alt.Axis(grid=True, tickCount=4)),
tooltip=['site'],
#color=alt.Color('effect_E2', title='EFNB2 Cell Entry', scale=alt.Scale(scheme='brownbluegreen',domainMid=0,domain=[-2,2])),
).properties(
width=600,
height=100
)
return chart.display()
def plot_affinity_by_site_E3_median(df):
mean = df.groupby('site')['binding_median_E3'].median().reset_index()
chart = alt.Chart(mean).mark_point(size=60, color='black', opacity=0.3,filled=True).encode(
x=alt.X('site', title=('Site'), axis=alt.Axis(grid=True, tickCount=6),scale=alt.Scale(domain=[70,602])),
y=alt.Y('binding_median_E3', title=('EFNB3 Binding'), axis=alt.Axis(grid=True, tickCount=4)),
tooltip=['site'],
#color=alt.Color('effect_E3',title='EFNB3 Cell Entry', scale=alt.Scale(scheme='brownbluegreen',domainMid=0,domain=[-2,2])),
).properties(
width=600,
height=100
)
return chart.display()
plot_affinity_by_site_E2_median(df_affinity_filter_merge)
plot_affinity_by_site_E3_median(df_affinity_filter_merge)
Make bubble plots for binding in different areas of receptor pocket¶
In [13]:
def make_boxplot_binding_region(df,title):# Create a box plot using Altair for aggregated means
barrel_ranges = {
'Hydrophobic': config['hydrophobic'],
'Salt Bridges': config['salt_bridges'],
'Hydrogen Bonds': config['h_bond_total'],
'Contact': config['contact_sites'],
'Overall': list(range(71,602)),
}
mean_df = df.groupby('site')[['binding_median']].median().reset_index()
custom_order = ['Hydrophobic','Salt Bridges','Hydrogen Bonds','Contact','Overall']
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df['site'].isin(sites)]
for _, row in subset.iterrows():
agg_means.append({'barrel': barrel, 'effect': row['binding_median'],'site':row['site']})
agg_means_df = pd.DataFrame(agg_means)
chart = alt.Chart(agg_means_df).mark_point(filled=True,size=70,opacity=0.4,color='black').encode(
x=alt.X('barrel:O', sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
y=alt.Y('effect',title=f'Median {title} Binding',axis=alt.Axis(grid=True,tickCount=4)),
xOffset='random:Q',
color = alt.Color('barrel').legend(None),
tooltip=['barrel', 'effect','site'],
).transform_calculate(
random="sqrt(-1*log(random()))*cos(2*PI*random())"
).properties(
height=alt.Step(20),
width=alt.Step(25)
)
return chart.display()
make_boxplot_binding_region(df_E2_filter,'EFNB2')
make_boxplot_binding_region(df_E3_filter,'EFNB3')
Make lineplot by site showing mean binding affinity¶
Plot histogram¶
In [14]:
def effect_histogram(df):
colors = {'E2': '#1f4e79', 'E3': '#ff7f0e'}
all_charts = []
for effect in ['E2','E3']:
func_effect_container = []
for func_effect in [-2]:# Melt the dataframe for the specific columns
df = df[
(df[f'effect_{effect}'] > func_effect)
]
color = colors[effect]
df_melted = df.melt(value_vars=['binding_median_E2', 'binding_median_E3'], var_name='Effect', value_name='Value')
# Histogram for 'effect_E2'
histogram = alt.Chart(df_melted[df_melted['Effect'] == f'binding_median_{effect}']).mark_bar(opacity=1,color=color).encode(
x=alt.X('Value', bin=alt.Bin(step=0.1), title=f'Binding {effect}',axis=alt.Axis(tickCount=4,values=[3,1,0,-1,-5]),scale=alt.Scale(domain=[-6,3])),
y=alt.Y('count()',stack=None,title='Count'),
#color=alt.Color('red'),
tooltip=['Effect', 'count()']
).properties(
width=150,
height=alt.Step(10)
)
func_effect_container.append(histogram)
combined_effect_chart = alt.hconcat(*func_effect_container).resolve_scale(y='shared', x='shared', color='independent')
all_charts.append(combined_effect_chart)
final_combined_chart = alt.vconcat(*all_charts).resolve_scale(y='independent', x='independent', color='independent')
return final_combined_chart.display()
effect_histogram(df_affinity_filter_merge)
#effect_histogram(df_affinity_filter_merge,'#ff7f0e','binding_mean_E3')
Make binding score heatmaps¶
First, prep data for heatmap¶
In [15]:
entropy_df = pd.read_csv(config['entropy'])
entropy_df = entropy_df[['site','henipavirus_entropy']]
entropy_df = entropy_df.dropna(subset=['site'])
entropy_df['site'] = entropy_df['site'].astype('Int64')
entropy_df = entropy_df.rename(columns={'henipavirus_entropy':'entropy'})
entropy_df = entropy_df[['site','entropy']].drop_duplicates()
entropy_df['mutant'] = 'entropy'
entropy_df['wildtype'] = ''
entropy_df['effect'] = 'effect'
entropy_df.rename(columns={'entropy': 'value'}, inplace=True)
display(entropy_df.head(3))
def make_contact():
df = pd.DataFrame({
'site': config['contact_sites'],
'contact': [0.0] * len(config['contact_sites'])
})
df = df[['site','contact']]
df['mutant'] = 'contact'
df['wildtype'] = ''
df['effect'] = 'binding_median'
df.rename(columns={'contact':'value'}, inplace=True)
return df
contact_df = make_contact()
display(contact_df.head(3))
def make_empty_df_with_filtered(df,filter_df):
sites = range(71, 603)
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
# Create the combination of each site with each amino acid
data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
# Create the DataFrame
empty_df = pd.DataFrame(data)
all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
df_melted = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['binding_median'],
var_name='effect', value_name='value')
df_filtered = filter_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['effect'],
var_name='effect', value_name='value')
df_test = pd.concat([df_melted,df_filtered,contact_df],ignore_index=True)
return df_test
empty_df_E2 = make_empty_df_with_filtered(df_E2_filter,df_E2_filter_missing)
empty_df_E3 = make_empty_df_with_filtered(df_E3_filter,df_E3_filter_missing)
| site | value | mutant | wildtype | effect | |
|---|---|---|---|---|---|
| 0 | 1 | -0.00 | entropy | effect | |
| 1 | 2 | 1.75 | entropy | effect | |
| 2 | 3 | 1.75 | entropy | effect |
| site | value | mutant | wildtype | effect | |
|---|---|---|---|---|---|
| 0 | 239 | 0.0 | contact | binding_median | |
| 1 | 240 | 0.0 | contact | binding_median | |
| 2 | 241 | 0.0 | contact | binding_median |
In [16]:
def plot_heatmap_full(df,legend_title):
# Create y-axis order list
custom_order = ['contact','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
if legend_title == 'EFNB2 Binding':
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-6,2.5]) #EFNB3 is -1.5,2, EFNB2 is -6,3
else:
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-2,2]) #EFNB3 is -1.5,2, EFNB2 is -6,3
color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)
strokewidth_size = 0.25
effect_legend_added = False
charts = [] # container to hold the charts
for subset in full_ranges:
subset_df = final_df[final_df['site'].isin(subset)]
#display(subset_df)
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title='Site', sort=sites, axis=alt.Axis(labelExpr="datum.value % 10 === 0 ? datum.value : ''",labelAngle=-90)),
y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype','mutant','value'],
)
.properties(
width=alt.Step(10),
height=alt.Step(11)
)
)
chart_empty = (
base.mark_rect(color='#d1d3d4').encode().transform_filter(
(alt.datum.effect == 'binding_median')
)
)
# Heatmap for effect
if not effect_legend_added:
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)),
alt.value('transparent')),
).transform_filter(
alt.datum.effect == 'binding_median'
)
)
effect_legend_added = True
else:
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=None),
alt.value('transparent')),
).transform_filter(
alt.datum.effect == 'binding_median'
)
)
chart_filtered = (
base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
).transform_filter(
alt.datum.effect == 'effect'
)
)
chart_contact = (
base.mark_rect(color='black').encode(
#color=alt.condition(f'datum.mutant != "entropy"',
#alt.Color('value:Q', scale=color_scale_contact,legend=None),
#alt.value('transparent')),
).transform_filter(
alt.datum.mutant == 'contact'
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect,chart_filtered,chart_contact, wildtype_layer_box,wildtype_layer).resolve_scale(color='independent')
charts.append(chart)
combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='shared')
return combined_chart#.display()
E2_binding_plot = plot_heatmap_full(empty_df_E2,'EFNB2 Binding')
E2_binding_plot.save(E2_binding_heatmap)
E3_binding_plot = plot_heatmap_full(empty_df_E3,'EFNB3 Binding')
E3_binding_plot.save(E3_binding_heatmap)
Now prep data for contact site heatmaps¶
In [17]:
entropy_df = pd.read_csv(config['entropy'])
entropy_df = entropy_df[['site','henipavirus_entropy']]
entropy_df = entropy_df.dropna(subset=['site'])
entropy_df['site'] = entropy_df['site'].astype('Int64')
entropy_df = entropy_df.rename(columns={'henipavirus_entropy':'entropy'})
entropy_df = entropy_df[['site','entropy']].drop_duplicates()
entropy_df['mutant'] = 'entropy'
entropy_df['wildtype'] = ''
entropy_df['effect'] = 'effect'
entropy_df.rename(columns={'entropy': 'value'}, inplace=True)
#display(entropy_df.head(3))
def make_contact(list_,name_):
df = pd.DataFrame({
'site': list_,
name_: [0.0] * len(list_)
})
df = df[['site',name_]]
df['mutant'] = name_
df['wildtype'] = ''
df['effect'] = 'binding_median'
df.rename(columns={name_:'value'}, inplace=True)
return df
contact_df = make_contact(config['contact_sites'],'contact')
#hydrophobic_df = make_contact(hydrophobic,'hydrophobic')
#salt_bridge_df = make_contact(salt_bridges,'salt bridge')
#h_bond_df = make_contact(h_bond_total,'hydrogen bond')
#display(contact_df.head(3))
def make_empty_df_with_filtered(df,filtered_df):
sites = range(71, 603)
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
# Create the combination of each site with each amino acid
data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
# Create the DataFrame
empty_df = pd.DataFrame(data)
all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
df_melted = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['binding_median'],
var_name='effect', value_name='value')
df_filtered = filtered_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['effect'],
var_name='effect', value_name='value')
df_test = pd.concat([df_melted,df_filtered],ignore_index=True) #contact_df
return df_test
empty_df_E2 = make_empty_df_with_filtered(df_E2_filter,df_E2_filter_missing)
empty_df_E3 = make_empty_df_with_filtered(df_E3_filter,df_E3_filter_missing)
In [18]:
def plot_heatmap_contact(df,subset,name):
# Create y-axis order list
custom_order = ['R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
if name == 'EFNB2 Binding':
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-6,2.5])
else:
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-2,1.5])
color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
color_scale_contact = alt.Scale(scheme='teals', domainMid=0, domain=[0, 5],reverse=True)
color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)
subset_df = final_df[final_df['site'].isin(subset)]
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
strokewidth_size = 0.25
charts = [] # container to hold the charts
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title='Contact Site', sort=sites, axis=alt.Axis(labelAngle=-90)),
y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype','mutant','value'],
)
.properties(
width=alt.Step(10),
height=alt.Step(11)
)
)
chart_empty = (
base.mark_rect(color='#e6e7e8').encode().transform_filter(
(alt.datum.effect == 'binding_median')
)
)
# Heatmap for effect
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=name)),
alt.value('transparent')),
).transform_filter(
alt.datum.effect == 'binding_median'
)
)
chart_filtered = (
base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
).transform_filter(
alt.datum.effect == 'effect'
)
)
chart_contact = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant != "entropy"',
alt.Color('value:Q', scale=color_scale_contact,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'hydrophobic') | (alt.datum.mutant == 'salt bridge') | (alt.datum.mutant == 'hydrogen bond')
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect,chart_filtered,wildtype_layer_box,wildtype_layer).resolve_scale(color='independent') #chart_contact,
charts.append(chart)
combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='shared')
return combined_chart.display()
plot_heatmap_contact(empty_df_E2,config['contact_sites'],'EFNB2 Binding')
plot_heatmap_contact(empty_df_E3,config['contact_sites'],'EFNB3 Binding')
Plot heatmap of sites with mutant effects different between amino acid types¶
In [19]:
def check_muts_by_properties(df,min_group_size,positive_value,negative_value):
letter_to_type = {'D': 'negative',
'E': 'negative',
'K': 'positive',
'R': 'positive',
'H': 'positive',
'C': 'special',
'S': 'hydrophilic',
'T': 'hydrophilic',
'N': 'hydrophilic',
'Q': 'hydrophilic',
'G': 'special',
'A': 'hydrophobic',
'V': 'hydrophobic',
'L': 'hydrophobic',
'I': 'hydrophobic',
'M': 'hydrophobic',
'P': 'special',
'F': 'aromatic',
'Y': 'armomatic',
'W': 'aromatic',
'*': 'stop'}
df['mutant_type'] = df['mutant'].map(letter_to_type)
#display(df)
#df = df[df['mutant_type'] != 'special']
#min_group_size = 3 # Set your minimum group size
grouped = df.groupby(['site', 'mutant_type'])
filtered_groups = grouped.filter(lambda x: (x['binding_median'] > positive_value).all() and len(x) >= min_group_size)
positive_mask = (
filtered_groups.groupby('site')['binding_median'].transform(lambda x: (x > positive_value).all()) &
df.groupby('site')['binding_median'].transform(lambda x: (x <= positive_value).any())
)
positive_sites = df[positive_mask]
filtered_groups_neg = grouped.filter(lambda x: (x['binding_median'] < negative_value).all() and len(x) >= min_group_size)
negative_mask = (
filtered_groups_neg.groupby('site')['binding_median'].transform(lambda x: (x < negative_value).all()) &
df.groupby('site')['binding_median'].transform(lambda x: (x <= negative_value).any())
)
negative_sites = df[negative_mask]
common_sites = pd.merge(positive_sites, negative_sites, on='site', how='inner')
final_sites = common_sites['site'].drop_duplicates()
return final_sites
final_sites_E2 = check_muts_by_properties(df_E2_filter,1,0,-0.5)
final_sites_E3 = check_muts_by_properties(df_E3_filter,1,0,-0.5)
print(list(final_sites_E2))
[82, 153, 169, 202, 214, 215, 232, 235, 241, 244, 245, 250, 268, 271, 284, 305, 309, 310, 319, 337, 347, 362, 378, 388, 394, 404, 406, 409, 440, 452, 458, 463, 479, 482, 508, 511, 525, 529, 555, 568, 577, 578, 591, 593]
In [20]:
def plot_heatmap_pref_switch(df,subset,name):
# Create y-axis order list
custom_order = ['contact','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
if name == 'EFNB2 Binding':
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0) #EFNB3 is -1.5,2, EFNB2 is -6,3
else:
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0) #EFNB3 is -1.5,2, EFNB2 is -6,3
color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
color_scale_contact = alt.Scale(scheme='teals', domainMid=0, domain=[0, 5],reverse=True)
color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)
subset_df = final_df[final_df['site'].isin(subset)]
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
strokewidth_size = 0.25
charts = [] # container to hold the charts
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title=None, sort=sites, axis=alt.Axis(labelAngle=-90)),
y=alt.Y('mutant', title=None, sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype','mutant','value'],
)
.properties(
width=alt.Step(10),
height=alt.Step(11)
)
)
chart_empty = (
base.mark_rect(color='#e6e7e8').encode().transform_filter(
(alt.datum.effect == 'binding_median')
)
)
# Heatmap for effect
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=name)),
alt.value('transparent')),
).transform_filter(
alt.datum.effect == 'binding_median'
)
)
chart_filtered = (
base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
).transform_filter(
alt.datum.effect == 'effect'
)
)
chart_contact = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant != "entropy"',
alt.Color('value:Q', scale=color_scale_contact,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'hydrophobic') | (alt.datum.mutant == 'salt bridge') | (alt.datum.mutant == 'hydrogen bond')
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect,chart_filtered,chart_contact, wildtype_layer_box,wildtype_layer).resolve_scale(color='independent')
charts.append(chart)
combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='shared')
return combined_chart.display()
cysteine_neck = [146, 158, 162]
cysteine_1 = [189,601]
cysteine_2 = [216, 240]
cysteine_3 = [282,295]
cysteine_4 = [382, 395]
cysteine_5 = [387, 499]
cysteine_6 = [493, 503]
cysteine_7 = [565, 574]
cysteine = cysteine_neck + cysteine_1 + cysteine_2 + cysteine_3 + cysteine_4 + cysteine_5 + cysteine_6 + cysteine_7
#print(cysteine)
n_linked = [159,306,378,417,481,529]
#n_linked_removal = [161,308,380,419,483,531]
stalk = list(range(96, 147))
neck = list(range(148,166))
linker = list(range(166,177))
#PNLG = [164, 180, 187, 191, 195, 275, 288, 311, 326, 359, 403, 423, 430, 473, 478, 543, 554, 570, 600]
total_list = [n_linked,stalk, neck, linker,cysteine]
for i in total_list:
plot_heatmap_pref_switch(empty_df_E2,i,'EFNB2 Binding')
plot_heatmap_pref_switch(empty_df_E3,i,'EFNB3 Binding')
Now plot heatmaps depending on wildtype amino acid property¶
In [21]:
hydrophobic_AA = ['A','V','L','I','M']
aromatic_AA = ['Y','W','F']
positive_AA = ['K','R','H']
negative_AA = ['E','D']
hydrophilic_AA = ['S','T','N','Q']
def find_aa_wildtype_sites(df,aa_type):
aa_list = list(df[df['wildtype'].isin(aa_type)]['site'].unique())
return aa_list
hydrophobic_AA_list = find_aa_wildtype_sites(df_E2_filter,hydrophobic_AA)
aromatic_AA_list = find_aa_wildtype_sites(df_E2_filter,aromatic_AA)
positive_AA_list = find_aa_wildtype_sites(df_E2_filter,positive_AA)
negative_AA_list = find_aa_wildtype_sites(df_E2_filter,negative_AA)
hydrophilic_AA_list = find_aa_wildtype_sites(df_E2_filter,hydrophilic_AA)
all_AA = [hydrophobic_AA_list,aromatic_AA_list,positive_AA_list,negative_AA_list,hydrophilic_AA_list]
for i in all_AA:
plot_heatmap_pref_switch(empty_df_E2,i,'EFNB2 Binding')
plot_heatmap_pref_switch(empty_df_E3,i,'EFNB3 Binding')
In [ ]: